# This script identifies proxy SNPs for the candidate SNPs which were missing in the IOWBC.
# For this, LDlink's LDproxy tool was used. It uses the 1000G reference panel, which is mapped to GRCh37 (hg19), the same build used in IOW and the UKB GWAS.
# In the IOWBC, 23 SNPs were missing, but only 21 have proxy SNPs available to source in the GBR population.
# 		# 1 SNP (rs200634877) is not present in UKB or IOW. It is also not in the 1000G reference, so a proxy cannot be found.
		# 1 SNP (rs1101999) is not present in UKB or IOW. It is monoallelic in the CEU and GBR populations, so no proxies are available there. Proxies are available in other European populations (TSI - Toscani in Italy, and IBS - Iberian population in Spain).
# The details are recorded in the Excel file: Genomic_risk_score_features.xlsx, sheet: SNPs included in PRS
###########################################################################################################################################################################################################################
# Shell step (awk, run from a terminal rather than from R):
# write the ID and position of every SNP in the UKB summary statistics
awk '{ printf "%s %s:%s\n", $1, $2, $3 }' FERREIRA_UKB_CHILD_ONSET_ASTHMA.20180501.allchr.assoc.GC > UKB_Ferreira_CO_snplist_details_for_proxies
# 9020834 snps - each output line has the form: SNP CHR:BP

# Shell step (awk): write the ID and position of every SNP in the IOW .bim file
awk '{ printf "%s %s:%s\n", $2, $1, $4 }' IoW_F1_imputed_QCed_ukbsnpsrenamed_FLG.bim > IOW_F1_QCed_snplist_details_for_proxies
# 7236418 snps - each output line has the form: SNP CHR:BP

#######################################################################################################################################################################
#######################################################################################################################################################################
# Proxy SNPs were identified using R, version 3.6.1

# Set working directory
setwd("/../../..")

# Import packages
library("devtools")
#devtools::install_github("CBIIT/LDlinkR") # version: LDlinkR_1.0.2.9000
library(LDlinkR) 

# Identify SNPs needing proxies
# Load list of 128 SNPs considered for the PRS
candidate_snps <- read.table("Curated_CEU_128_independent_snplist.txt", header = FALSE)
# Load the list of PRS SNPs available in the IOWBC
IOW_PRS_snps <- read.table("/scratch/dk2e18/IoW_Genotype_Data/PRS/Candidate_snps/candidate_snps_in_IOW", header = FALSE)

# Identify SNPs missing in the IOWBC and which therefore need proxy SNPs
'%ni%' <- Negate('%in%')
snps_to_upload <- candidate_snps[candidate_snps$V1 %ni% IOW_PRS_snps$V2, ]

# Source proxies from LDLink
# NOTE(review): the API token used to be hard-coded here. It is now taken from the
# LDLINK_TOKEN environment variable, with the previous value kept only as a
# fallback so the script still runs unchanged; prefer setting the variable and
# removing the literal from version control.
ldlink_token <- Sys.getenv("LDLINK_TOKEN", unset = "6ee8af5d119d")
LDproxy_batch(snp = snps_to_upload, pop = "GBR", r2d = "r2", token = ldlink_token, append = FALSE)

# Identify top proxy snp available in UKB and IOW
# Load information for SNPs available in IOWBC
target <- read.table("IOW_F1_QCed_snplist_details_for_proxies", header = FALSE)
# Edit column names/formatting to enable cross-referencing with data available from LDproxy
colnames(target) <- c("SNP", "Coord")
# Build "chrN:BP" coordinates; chromosome 23 is relabelled as X
target$Coordx <- paste0("chr", target$Coord)
target$Coordx <- gsub("chr23", "chrX", target$Coordx)

# Load information for SNPs available in Ferreira et al.'s UKB GWAS summary statistics
base <- read.table("/scratch/dk2e18/IoW_Genotype_Data/PRS/Candidate_snps/Proxy_snps/UKB_Ferreira_CO_snplist_details_for_proxies", header = TRUE)
colnames(base) <- c("SNP", "Coord")

# For each SNP needing a proxy SNP, identify if the SNP is present in the IOWBC and UKB GWAS. 
results <- NULL
# List only the per-SNP LDproxy result files (named like "rs5758364.txt").
# `pattern` is a regular expression, not a shell glob: the previous "*.txt" also
# matched any other text file in the working directory, including the curated
# candidate SNP list loaded above, which is not an LDproxy result table.
ids <- dir(pattern = "^rs[0-9]+\\.txt$")
#ids <- "rs5758364.txt"

# Pick the best usable proxy for one candidate SNP.
#
# Args:
#   data: data frame of candidate proxies for a single SNP, already ordered so
#     that the preferred proxy comes first. Rows usable as proxies are marked
#     with "Yes" in the `available` column.
#   original_snp: ID of the candidate SNP the proxies belong to. Optional; when
#     NULL the value is taken from the `original_SNP` variable set by the
#     calling script, which is how the function was originally used.
#
# Returns:
#   A one-row data frame holding `original_SNP` followed by the columns of
#   `data`. If at least one row is available, it is the first available row.
#   Otherwise it is a placeholder row with NA in every column except
#   `available`, which is set to "No", so the SNP stays visible in the combined
#   results instead of aborting the run.
select_proxy <- function(data, original_snp = NULL) {
  if (is.null(original_snp)) {
    # Backward-compatible fallback: read the ID set by the calling loop.
    original_snp <- original_SNP
  }

  usable <- data[which(data$available == "Yes"), , drop = FALSE]
  if (nrow(usable) > 0) {
    return(cbind(original_SNP = original_snp, usable[1, , drop = FALSE]))
  }

  # No usable proxy. The previous code tested `length(...) < 0`, which can never
  # be true, and then failed on an undefined `first_record`. Return a row with
  # the same columns as a real hit so it can be rbind-ed with the other results.
  placeholder <- data[NA_integer_, , drop = FALSE]
  rownames(placeholder) <- NULL
  placeholder$available <- "No"
  cbind(original_SNP = original_snp, placeholder)
}

# For each SNP with a potential proxy SNP available, identify the SNP in highest LD and closest to the original candidate SNP. 
# Each LDproxy result file is annotated with availability in the UKB GWAS (base)
# and the IOWBC (target), ranked, written out as <SNP>_checked.csv, and its top
# available proxy is collected into `results`.

proxy_rows <- list()

for (i in ids) {
  infile <- i
  # Candidate SNP ID = file name without the ".txt" extension.
  # Kept as a global named `original_SNP` because select_proxy() reads it.
  original_SNP <- sub("\\.txt$", "", infile)
  outfile <- paste0(original_SNP, "_checked.csv")
  print(infile)
  print(outfile)

  # NOTE(review): relies on read.table() auto-detecting the header of the
  # LDproxy output file - confirm against the files written by LDproxy_batch.
  snp <- read.table(infile)

  # Present in the UKB GWAS summary statistics (matched on rsID only)
  snp$in_base <- ifelse(snp$RS_Number %in% base$SNP, "Yes", "No")
  # Present in the IOWBC (matched on rsID or on chr:bp coordinate)
  snp$in_target <- ifelse((snp$RS_Number %in% target$SNP) | (snp$Coord %in% target$Coordx), "Yes", "No")
  # Usable as a proxy only if present in both datasets
  snp$available <- ifelse(snp$in_base == "Yes" & snp$in_target == "Yes", "Yes", "No")

  # Rank by highest R2, then by the smallest absolute distance. Sorting on the
  # signed distance would rank far-upstream (large negative) variants ahead of
  # nearby ones, contradicting "closest to the original candidate SNP".
  newdata <- snp[order(-snp$R2, abs(snp$Distance)), ]

  proxy <- select_proxy(newdata)
  # Collect rows in a list and bind once after the loop
  proxy_rows[[infile]] <- proxy
  write.csv(newdata, outfile, quote = FALSE, row.names = FALSE)
}

# unname() keeps the file names out of the row names of the combined table
results <- do.call(rbind, unname(proxy_rows))

dim(results)
# 20 = all snps have proxies
# which() ignores rows where R2 is missing instead of returning all-NA rows
x <- results[which(results$R2 < 0.80), ]
# 9 have r2 <0.8 and so no proxies will be selected for these.
write.csv(results, "Top_GBR_proxy_snps_in_IOW.csv", quote = FALSE, row.names = FALSE)
